# This is an example of two variables where
# we think that there may be a linear relation
# between the values in L1 and the values in L2
source("../gnrnd4.R")
gnrnd4(489031506, 3110420504, 1900005)
# look at the values
L1
L2
# it might be easier if we remember that the L1
# values are our x values
x <- L1
# the L2 values are our y values. Later we will have
# some other y-values so let us call these the
# y_observed values
y_observed <- L2
# it is hard to see a relation just by looking at the
# values. Perhaps we should look at the graph of the two
plot( x, y_observed )
# Now, just for fun, find the mean value for each variable
x_bar <- mean( x )
x_bar
y_observed_bar <- mean( y_observed )
y_observed_bar
# and let us add that to the graph as a solid point
points( x_bar, y_observed_bar, pch=19)
#
# actually, just so that we can get a better picture
# let us at least try to make the graph somewhat
# square and let us get the origin into the graph.
plot( x, y_observed, xlim=c(-5,45), ylim=c(-5,45),
      las=1, xaxp=c(-5,45, 10), yaxp=c(-5,45, 10))
points( x_bar, y_observed_bar, pch=19)
# light grid lines every 5 units, plus the axes through the origin
abline(h=seq(-5,45,5), v=seq(-5,45,5),
       lty="dotted", col="gray")
abline(h=0, v=0)
# It looks like a good line that approximates the
# plotted points would be one going through
# the points (0,5) and (25, 45). From our algebra
# background that would be a line with y-intercept
# (0,5) and a slope of 40/25=8/5. Let us draw this
# line on the graph.
abline(5, 8/5, col="red" )
#
# How good of a fit is that line?
# To answer this we need to find the values on
# the line that correspond to each of our x values.
# We will call them the y_expected values.
y_expected <- 5 + (8/5)*x
# And we could look at the values
y_expected
# And we could even plot the values
points(x, y_expected, pch=17, col="red")
# This is a bit too crowded.
# Let us go back to a
# closer view of the same stuff
plot( x, y_observed, xlim=c(10,26), ylim=c(20,44),
      las=1, xaxp=c(10,26, 8), yaxp=c(20,44, 12))
points( x_bar, y_observed_bar, pch=19)
abline(h=seq(20,46,2), v=seq(10,26,2),
       lty="dotted", col="gray")
abline(5, 8/5, col="red" )
points(x, y_expected, pch=17, col="red")
#
# To get a measure of the fit we first find all of the
# observed - expected values
x           # just to see them again
y_observed  # just to see them again
y_expected  # just to see them again
difference <- y_observed - y_expected
difference
# Then get the squares of those values
diff_sqr <- difference^2
diff_sqr
# Now get the sum of those squared values
sum( diff_sqr )
# let us try a different line, and we will see if
# it is a better fit
y_expected_2 <- 10 + 1.25*x
y_expected_2
abline( 10, 1.25, col="green")
difference <- y_observed - y_expected_2
difference
# and we can get the sum of the differences squared
sum( difference^2)
# that sum is less than the first line so
# this line is a better fit.
#
# Now the challenge is to find the best line,
# the line with the best fit possible.
# R has a function to do that, the lm() function.
# Save the fitted model so we can reuse its exact
# coefficients instead of retyping rounded values.
best_model <- lm( y_observed ~ x )
best_model   # prints the intercept (about 9.127) and slope (about 1.340)
# plot that line -- abline() accepts an lm object directly
# and uses the exact (unrounded) coefficients
abline( best_model, col="black")
# let us see the "goodness" of that model
y_expected_3 <- coef(best_model)[1] + coef(best_model)[2]*x
difference <- y_observed - y_expected_3
sum( difference^2)
#
# By the way, the differences, observed - expected,
# are called the residuals. We want to have the
# graph of the x values and the residuals to
# be scattered all over the place
plot( x, difference)
# let us return to our big graph and then
# graph the regression equation
plot( x, y_observed, xlim=c(-5,45), ylim=c(-5,45),
      las=1, xaxp=c(-5,45, 10), yaxp=c(-5,45, 10))
points( x_bar, y_observed_bar, pch=19)
abline(h=seq(-5,45,5), v=seq(-5,45,5),
       lty="dotted", col="gray")
abline(h=0, v=0)
abline( best_model, col="black")
# Note that the regression equation goes through our
# plot of the point( mean(x), mean(y) )